*** How worried should we be? The implications of fabricated survey data for political science
*** Table A4: Share of Clean and Fabricated Interviews by Region

* Run straight through without pausing at -more- prompts
set more off

* Point Stata at the folder that holds the dataset (edit the path below)
cd "C:\~\Downloads\"

* Load the data, discarding whatever is currently in memory
use "VEN_fraud_data.dta", clear

** Build the grouping variable used for the full-sample comparisons
* Retain every observation except those with clean_data == 0 that are not
* CEM-matched (cem_matched != 1); the original note calls these the canceled
* cases that were not likely frauds matched to clean cases
keep if clean_data != 0 | cem_matched == 1

* comparison_groups: 1 = fake, 2 = clean matched, 3 = rest of data
* (left missing for any other combination of likelyfraud and cem_matched)
generate comparison_groups = 1 if cem_matched == 1 & likelyfraud == 1
replace comparison_groups = 2 if cem_matched == 1 & likelyfraud == 0
replace comparison_groups = 3 if cem_matched == 0 & likelyfraud == 0

* double the "rest of the dataset" (non-matched clean interviews)
* exp holds the number of copies each observation should end up with:
* 2 for comparison_groups == 3, and 1 (i.e. left as is) for everything else
gen exp=1
* comparison_groups is written out in full rather than abbreviated, so this
* line does not depend on -set varabbrev on- or on the prefix staying unique
replace exp=2 if comparison_groups == 3
* copy = 0 for observations already in the data, 1 for the added duplicates
expand exp, gen(copy)

* compromised versus clean indicator (1 = clean, 2 = compromised)
gen clean_or_compr = .
* Order matters: the first replace marks every original observation
* (copy == 0) as clean, including comparison_groups == 1; the second replace
* then overrides comparison_groups == 1 and marks the duplicates (copy == 1)
* as compromised.
* comparison_groups is written out in full rather than abbreviated, so these
* lines do not depend on -set varabbrev on- or on the prefix staying unique
replace clean_or_compr = 1 if comparison_groups == 2 | copy == 0
replace clean_or_compr = 2 if comparison_groups == 1 | copy == 1

* 0/1 version of the same indicator, with value labels attached
gen clean = 1 if clean_or_compr == 1
replace clean = 0 if clean_or_compr == 2
lab define clean_comp_lab 0 "Compromised" 1 "Clean"
lab values clean clean_comp_lab

* remove observations marked compromised that have clean_data == 1
drop if clean_or_compr == 2 & clean_data == 1

** Table A4 **
* Create one 0/1 dummy per category of estratopri (estrato1, estrato2, ...);
* the frequency table itself is also displayed
tabulate estratopri, generate(estrato)

* Dummies entering the table.
* NOTE(review): estrato3 is not in this list -- confirm the omission is intended
local region_dummies estrato1 estrato2 estrato4 estrato5 estrato6 estrato7 estrato8

* Two-group tests of proportions by clean_or_compr, posted for use by esttab
estpost prtest `region_dummies', by(clean_or_compr)

* Export to RTF; the cells are presumably the two group proportions (P_1, P_2),
* their difference (b) and the p-value (p) -- verify against estpost prtest's
* stored results
esttab using venz_regions.rtf, replace ///
    cells("P_1(fmt(%12.3f)) P_2(fmt(%12.3f)) b(fmt(%12.3f)) p(fmt(%12.3f))") ///
    nonumber label varwidth(25)
